# Point the session at the workshop folder so the relative data paths below
# resolve.
# NOTE(review): this is an absolute, user-specific path -- adjust it when
# running the script on another machine.
setwd(dir = "/Users/ginny/Downloads/Rgraphics")

# Load the state-level housing data with read.csv() defaults.
housing <- read.csv(file = "dataSets/landdata-states.csv")

# Preview the first rows of the first five columns.
head(x = housing[1:5])
## State region Date Home.Value Structure.Cost
## 1 AK West 2010.25 224952 160599
## 2 AK West 2010.50 225511 160252
## 3 AK West 2009.75 225820 163791
## 4 AK West 2010.00 224994 161787
## 5 AK West 2008.00 234590 155400
## 6 AK West 2008.25 233714 157458
# ---- Simple plots: base graphics vs. ggplot2 ----

# Base R: histogram of the Home.Value column.
hist(housing$Home.Value)

# ggplot2: the same histogram, expressed as data + aesthetic mapping + geom.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
ggplot(data = housing, mapping = aes(x = Home.Value)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# ---- More complex plots: two states on one set of axes ----

# Base R: draw MA first (it fixes the axes), overlay TX in red, then add the
# legend by hand at data coordinates (1975, 400000).
plot(Home.Value ~ Date, data = subset(x = housing, subset = State == "MA"))
points(Home.Value ~ Date, data = subset(x = housing, subset = State == "TX"),
       col = "red")
legend(x = 1975, y = 400000,
       legend = c("MA", "TX"),
       title = "State",
       col = c("black", "red"),
       pch = c(1, 1))

# ggplot2: mapping State to color produces both the colors and the legend.
ggplot(data = subset(x = housing, subset = State %in% c("MA", "TX")),
       mapping = aes(x = Date, y = Home.Value, color = State)) +
  geom_point()
# ---- Aesthetic mappings and geometric objects ----

# Search the installed help for ggplot2 topics matching "geom_".
help.search(pattern = "geom_", package = "ggplot2")

# Keep only the rows whose Date equals 2001.25.
hp2001Q1 <- subset(x = housing, subset = Date == 2001.25)

# Scatter plot of structure cost against land value ...
ggplot(data = hp2001Q1,
       mapping = aes(x = Land.Value, y = Structure.Cost)) +
  geom_point()

# ... and again with land value on a log scale.
ggplot(data = hp2001Q1,
       mapping = aes(x = log(Land.Value), y = Structure.Cost)) +
  geom_point()
# ---- Lines, smoothers and text labels ----

# Store fitted values from a linear model of structure cost on log land value
# so they can be drawn as a line layer.
hp2001Q1$pred.SC <- predict(
  lm(Structure.Cost ~ log(Land.Value), data = hp2001Q1)
)

# Base plot object reused by the layers below.
p1 <- ggplot(data = hp2001Q1,
             mapping = aes(x = log(Land.Value), y = Structure.Cost))

# Points colored by home value, plus the fitted line.
p1 +
  geom_point(mapping = aes(color = Home.Value)) +
  geom_line(mapping = aes(y = pred.SC))

# Variables referenced inside aes() must be columns of the plot's data frame.
p1 +
  geom_point(mapping = aes(color = Home.Value)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

# Label each observation with its state abbreviation.
p1 +
  geom_text(mapping = aes(label = State), size = 3)

# ggrepel supplies a text geom that pushes labels away from each other.
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 3.3.2
p1 +
  geom_point() +
  geom_text_repel(mapping = aes(label = State), size = 3)
# Variables are mapped to aesthetics inside aes(); fixed (constant) aesthetics are set outside the aes() call.
# Setting constant aesthetics: both size and color are fixed values, so they
# go outside aes(). (Writing aes(size = 2) would treat 2 as a data variable,
# adding a meaningless legend and leaving the point size to the size scale.)
p1 +
  geom_point(size = 2, color = "red")

# Mapping variables to aesthetics: color and shape vary with the data, so
# they go inside aes().
p1 +
  geom_point(aes(color = Home.Value, shape = region))
## Warning: Removed 1 rows containing missing values (geom_point).
# ---- Exercise: scatter plots of the Economist data ----

# NOTE(review): absolute, user-specific path -- adjust when running elsewhere.
setwd(dir = "/Users/ginny/Downloads/Rgraphics")
dat <- read.csv(file = "dataSets/EconomistData.csv")

# 1. Plain scatter plot of HDI against CPI.
ggplot(data = dat, mapping = aes(x = CPI, y = HDI)) +
  geom_point()

# 2. All points set to blue (constant, so outside aes()).
ggplot(data = dat, mapping = aes(x = CPI, y = HDI)) +
  geom_point(color = "blue")

# 3. Color mapped to Region.
ggplot(data = dat, mapping = aes(x = CPI, y = HDI)) +
  geom_point(mapping = aes(color = Region))

# 4. Color mapped to Region, size fixed at 2.
ggplot(data = dat, mapping = aes(x = CPI, y = HDI)) +
  geom_point(mapping = aes(color = Region), size = 2)

# 5. Color mapped to Region, size mapped to HDI.Rank.
ggplot(data = dat, mapping = aes(x = CPI, y = HDI)) +
  geom_point(mapping = aes(color = Region, size = HDI.Rank))
# Boxplots, histograms, prediction lines, etc. require a statistical transformation of the data.
# For a boxplot, the y values must be transformed to the median and 1.5 * IQR.
# ---- Statistical transformations ----

# geom_histogram() defaults to stat = "bin" ...
args(name = geom_histogram)
## function (mapping = NULL, data = NULL, stat = "bin", position = "stack",
## ..., binwidth = NULL, bins = NULL, na.rm = FALSE, show.legend = NA,
## inherit.aes = TRUE)
## NULL

# ... and stat_bin() defaults to geom = "bar".
args(name = stat_bin)
## function (mapping = NULL, data = NULL, geom = "bar", position = "stack",
## ..., binwidth = NULL, bins = NULL, center = NULL, boundary = NULL,
## breaks = NULL, closed = c("right", "left"), pad = FALSE,
## na.rm = FALSE, show.legend = NA, inherit.aes = TRUE)
## NULL

# Histogram with the default binning, then with an explicit bin width.
p2 <- ggplot(data = housing, mapping = aes(x = Home.Value))
p2 +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p2 +
  geom_histogram(stat = "bin", binwidth = 4000)
# Mean home value per state.
housing.sum <- aggregate(
  x = housing["Home.Value"],
  by = housing["State"],
  FUN = mean
)

# Show the first and last rows of the summary.
rbind(head(housing.sum), tail(housing.sum))
## State Home.Value
## 1 AK 147385.14
## 2 AL 92545.22
## 3 AR 82076.84
## 4 AZ 140755.59
## 5 CA 282808.08
## 6 CO 158175.99
## 46 VA 155391.44
## 47 VT 132394.60
## 48 WA 178522.58
## 49 WI 108359.45
## 50 WV 77161.71
## 51 WY 122897.25

# The data are already summarised, so the bars use stat = "identity" to plot
# the values as given instead of counting rows.
ggplot(data = housing.sum, mapping = aes(x = State, y = Home.Value)) +
  geom_bar(stat = "identity")
# ---- Exercise: smoothers and prediction lines on the Economist data ----

# Scatter plot of HDI against CPI; every plot below adds a layer to it.
p3 <- ggplot(dat, aes(x = CPI, y = HDI)) + geom_point()
p3

# 1. Default smoother.
p3 +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

# 2. Linear-model smoother.
p3 +
  geom_smooth(method = "lm")

# 3. Hand-computed linear prediction, stored as a column of `dat` and drawn
#    with geom_line(). The column is referenced by its bare name: writing
#    aes(y = dat$ps) would pull the vector from the global `dat` rather than
#    from the layer's data, which ggplot2 discourages and which breaks as soon
#    as the plot data is subset, reordered or faceted.
dat$ps <- predict(lm(HDI ~ CPI, data = dat))
p3 +
  geom_line(aes(y = ps))

# 4. Two loess smoothers with different spans: small span = wiggly,
#    large span (drawn in red) = smooth.
p3 +
  geom_smooth(span = 0.2) +
  geom_smooth(span = 1, color = "red")
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'
# Scales control the aesthetic mapping (how data values are translated into visual properties).
# Common scale arguments: name, limits, breaks, labels.
# ---- Scales: controlling the aesthetic mapping ----

# Base plot: home price index by state, legend on top, small axis text.
p4 <- ggplot(data = housing,
             mapping = aes(x = State, y = Home.Price.Index)) +
  theme(legend.position = "top",
        axis.text = element_text(size = 6))

# Jittered, semi-transparent points colored by date.
# alpha is the transparency value; jitter is horizontal only (height = 0).
p5 <- p4 +
  geom_point(mapping = aes(color = Date),
             alpha = 0.5,
             size = 1.5,
             position = position_jitter(width = 0.1, height = 0))

# Rename the x axis and relabel the color legend breaks.
p5 +
  scale_x_discrete(name = "State Abbreviation") +
  scale_color_continuous(name = "",
                         breaks = c(1976, 1994, 2013),
                         labels = c("'76", "'94", "'13"))

# Same, with an explicit blue-to-red color gradient.
p5 +
  scale_x_discrete(name = "State Abbreviation") +
  scale_color_continuous(name = "",
                         breaks = c(1976, 1994, 2013),
                         labels = c("'76", "'94", "'13"),
                         low = "blue", high = "red")

library(reshape2)
library(scales)
## Warning: package 'scales' was built under R version 3.3.2

# Same again, using muted() from scales for softer end-point colors.
p5 +
  scale_x_discrete(name = "State Abbreviation") +
  scale_color_continuous(name = "",
                         breaks = c(1976, 1994, 2013),
                         labels = c("'76", "'94", "'13"),
                         low = muted("blue"), high = muted("red"))
# ---- Exercise: scales on the Economist data ----

# Base plot shared by the three variants below.
p6 <- ggplot(dat, aes(x = CPI, y = HDI))

# 1. Points colored by Region, default scales.
p6 +
  geom_point(aes(color = Region))

# 2. Descriptive axis titles. CPI and HDI are plotted as continuous
#    variables, so the titles are set through the continuous position scales;
#    a discrete scale on a continuous variable drops the axis breaks or, in
#    newer ggplot2 releases, fails with "Continuous value supplied to
#    discrete scale".
p6 +
  geom_point(aes(color = Region)) +
  scale_x_continuous(name = "Corruption Perception Index") +
  scale_y_continuous(name = "Human Development Index")

# 3. Same axes, with a hand-picked color for each Region level.
#    NOTE(review): six colors are supplied -- confirm Region has at most six
#    levels, otherwise scale_color_manual() reports insufficient values.
p6 +
  geom_point(aes(color = Region)) +
  scale_x_continuous(name = "Corruption Perception Index") +
  scale_y_continuous(name = "Human Development Index") +
  scale_color_manual(
    values = c("red", "blue", "green", "black", "grey", "orange")
  )
# Faceting is ggplot2 parlance for small multiples:
# the idea is to create separate graphs for subsets of the data.
# ggplot2 offers two functions for this: facet_wrap() and facet_grid().
# Faceting facilitates comparison among plots, not just among geoms within a plot.
# One line per state on a single panel, distinguished by color.
p_5 <- ggplot(data = housing, mapping = aes(x = Date, y = Home.Value))
p_5 +
  geom_line(mapping = aes(color = State))
# Two problems here: there are too many states to distinguish by color, and the lines obscure one another.
# ---- Faceting and themes ----

# One small panel per state, laid out ten columns wide.
p_5 <- ggplot(data = housing, mapping = aes(x = Date, y = Home.Value)) +
  geom_line() +
  facet_wrap(~State, ncol = 10)
p_5

# The same faceted plot under two built-in themes.
p_5 + theme_linedraw()
p_5 + theme_light()

# Built-in theme plus an override: all text elements in turquoise.
p_5 +
  theme_minimal() +
  theme(text = element_text(color = "turquoise"))
# Mean home value and mean land value for each Date.
housing.byyear <- aggregate(
  cbind(Home.Value, Land.Value) ~ Date,
  data = housing,
  FUN = mean
)

# Wide-format version: one geom_line() per column, colors set by hand, and
# therefore no legend.
ggplot(data = housing.byyear, mapping = aes(x = Date)) +
  geom_line(mapping = aes(y = Home.Value), color = "red") +
  geom_line(mapping = aes(y = Land.Value), color = "blue")

# Preferred version: reshape to long format and map the series to color.
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths

# Stack Home.Value and Land.Value into a single "value" column, with a new
# "type" column recording which series each row came from.
home.land.byyear <- gather(
  data = housing.byyear,
  key = "type",
  value = "value",
  Home.Value, Land.Value
)

ggplot(data = home.land.byyear,
       mapping = aes(x = Date, y = value, color = type)) +
  geom_line()
# ---- A bit of tidyr ----
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union

# Toy wide-format data: ten consecutive dates and three random price series
# with standard deviations 1, 2 and 4.
stocks <- data_frame(
  time = as.Date('2009-01-01') + 0:9,
  X = rnorm(n = 10, mean = 0, sd = 1),
  Y = rnorm(n = 10, mean = 0, sd = 2),
  Z = rnorm(n = 10, mean = 0, sd = 4)
)

# Reshape to long format: every column except `time` is stacked into
# stock/price pairs.
gather(data = stocks, key = stock, value = price, -time)
## # A tibble: 30 x 3
## time stock price
## <date> <chr> <dbl>
## 1 2009-01-01 X -0.6183968
## 2 2009-01-02 X -1.8399555
## 3 2009-01-03 X -0.5451043
## 4 2009-01-04 X 0.4296350
## 5 2009-01-05 X 1.1010343
## 6 2009-01-06 X 1.0269384
## 7 2009-01-07 X -0.7113512
## 8 2009-01-08 X -0.5454103
## 9 2009-01-09 X -1.2744525
## 10 2009-01-10 X 0.1565849
## # ... with 20 more rows

# Same reshape, stored for later use; `-time` means "every column but time".
tid <- gather(stocks, stock, price, -time)
# ---- Putting it all together: the "Corruption and human development" plot ----
library(ggrepel)

# Quadratic fit of HDI on CPI, used for the red trend line.
model <- lm(HDI ~ poly(CPI, 2), data = dat)

# predict.lm() has no `x` argument (it would be silently swallowed by `...`);
# the data to predict for is passed as `newdata`. This also guarantees one
# prediction per row of `dat`, with NA where CPI is missing.
dat$pred <- predict(model, newdata = dat)

# R squared of the fit. It is computed by hand here and kept for reference;
# it is not drawn on the plot.
sm <- summary(model)
r_s <- sm$r.squared

# Labels start as the country names. Converting to character first matters:
# if Country was read as a factor, assigning "" to the copy would produce NA
# and an "invalid factor level" warning instead of an empty label.
dat$label <- as.character(dat$Country)

# Countries that keep their label; every other label is blanked out.
# NOTE(review): "US" may be spelled differently in the data (for example
# "United States") -- confirm against dat$Country, otherwise it stays unlabeled.
important_countries <- c(
  "Italy", "France", "Greece", "Russia", "Spain", "Norway", "New Zealand",
  "US", "China", "South Africa", "Venezuela", "Congo", "Rwanda", "India",
  "Botswana", "Germany", "Singapore", "Japan", "Sudan", "Iraq", "Brazil",
  "Afghanistan", "Myanmar", "Bhutan", "Britain"
)
muted_which <- which(!(dat$Country %in% important_countries))
dat$label[muted_which] <- ""

p6 <- ggplot(dat, aes(x = CPI, y = HDI))
p6 +
  # Trend line underneath the points.
  geom_line(aes(y = pred), size = 1, color = "red") +
  # Hollow circles (shape 21, white fill) outlined in the region color.
  geom_point(aes(color = Region), size = 4, shape = 21, stroke = 1.5,
             fill = "white") +
  # Empty-string labels are not drawn, but their points still repel the
  # labels that are.
  geom_text_repel(aes(label = label), size = 4,
                  point.padding = unit(0.3, "lines")) +
  scale_color_manual(values = c("dodgerblue", "darkturquoise", "seagreen",
                                "deepskyblue4", "brown1", "brown")) +
  scale_x_continuous(name = "Corruption Perception Index",
                     limits = c(1, 10), breaks = 1:10) +
  scale_y_continuous(name = "Human Development Index",
                     limits = c(0.2, 1.0)) +
  theme(legend.position = "top",
        legend.key = element_rect(fill = "white"),
        legend.title = element_blank(),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 10),
        panel.background = element_rect(colour = "white", fill = "white"),
        panel.grid.major.y = element_line(size = 0.1, colour = "grey"),
        plot.title = element_text(face = "bold", size = 16, hjust = 0)) +
  # Keep the legend on a single row above the panel.
  guides(colour = guide_legend(nrow = 1)) +
  ggtitle("Corruption and human development")